import pandas as pd
import plotly.express as px
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import os
# Plot styling: seaborn dark grid plus matplotlib defaults for font size,
# figure size and figure background colour ('#00000000' has zero alpha).
sns.set_style('darkgrid')
matplotlib.rcParams.update({
    'font.size': 14,
    'figure.figsize': (10, 6),
    'figure.facecolor': '#00000000',
})

# Move into the folder that holds the CSV files, then load both data sets.
os.chdir('C:\\Users\\lance\\OneDrive\\Desktop\\DATA science\\')
df_train = pd.read_csv('train_v9rqX0R.csv')
df_test = pd.read_csv('test_AbJTz2l.csv')

# Preview the first rows of the training data.
df_train.head()
| Item_Identifier | Item_Weight | Item_Fat_Content | Item_Visibility | Item_Type | Item_MRP | Outlet_Identifier | Outlet_Establishment_Year | Outlet_Size | Outlet_Location_Type | Outlet_Type | Item_Outlet_Sales | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | FDA15 | 9.30 | Low Fat | 0.016047 | Dairy | 249.8092 | OUT049 | 1999 | Medium | Tier 1 | Supermarket Type1 | 3735.1380 |
| 1 | DRC01 | 5.92 | Regular | 0.019278 | Soft Drinks | 48.2692 | OUT018 | 2009 | Medium | Tier 3 | Supermarket Type2 | 443.4228 |
| 2 | FDN15 | 17.50 | Low Fat | 0.016760 | Meat | 141.6180 | OUT049 | 1999 | Medium | Tier 1 | Supermarket Type1 | 2097.2700 |
| 3 | FDX07 | 19.20 | Regular | 0.000000 | Fruits and Vegetables | 182.0950 | OUT010 | 1998 | NaN | Tier 3 | Grocery Store | 732.3800 |
| 4 | NCD19 | 8.93 | Low Fat | 0.000000 | Household | 53.8614 | OUT013 | 1987 | High | Tier 3 | Supermarket Type1 | 994.7052 |
# Summary statistics (count, mean, std, min, quartiles, max) for the training set.
df_train.describe()
| Item_Weight | Item_Visibility | Item_MRP | Outlet_Establishment_Year | Item_Outlet_Sales | |
|---|---|---|---|---|---|
| count | 7060.000000 | 8523.000000 | 8523.000000 | 8523.000000 | 8523.000000 |
| mean | 12.857645 | 0.066132 | 140.992782 | 1997.831867 | 2181.288914 |
| std | 4.643456 | 0.051598 | 62.275067 | 8.371760 | 1706.499616 |
| min | 4.555000 | 0.000000 | 31.290000 | 1985.000000 | 33.290000 |
| 25% | 8.773750 | 0.026989 | 93.826500 | 1987.000000 | 834.247400 |
| 50% | 12.600000 | 0.053931 | 143.012800 | 1999.000000 | 1794.331000 |
| 75% | 16.850000 | 0.094585 | 185.643700 | 2004.000000 | 3101.296400 |
| max | 21.350000 | 0.328391 | 266.888400 | 2009.000000 | 13086.964800 |
# Print each training column's dtype and non-null count.
df_train.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 8523 entries, 0 to 8522 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Item_Identifier 8523 non-null object 1 Item_Weight 7060 non-null float64 2 Item_Fat_Content 8523 non-null object 3 Item_Visibility 8523 non-null float64 4 Item_Type 8523 non-null object 5 Item_MRP 8523 non-null float64 6 Outlet_Identifier 8523 non-null object 7 Outlet_Establishment_Year 8523 non-null int64 8 Outlet_Size 6113 non-null object 9 Outlet_Location_Type 8523 non-null object 10 Outlet_Type 8523 non-null object 11 Item_Outlet_Sales 8523 non-null float64 dtypes: float64(4), int64(1), object(7) memory usage: 799.2+ KB
# Number of missing entries in each training column.
missing_val_train = df_train.isnull().sum()

# Summary statistics for the test set.
df_test.describe()
| Item_Weight | Item_Visibility | Item_MRP | Outlet_Establishment_Year | |
|---|---|---|---|---|
| count | 4705.000000 | 5681.000000 | 5681.000000 | 5681.000000 |
| mean | 12.695633 | 0.065684 | 141.023273 | 1997.828903 |
| std | 4.664849 | 0.051252 | 61.809091 | 8.372256 |
| min | 4.555000 | 0.000000 | 31.990000 | 1985.000000 |
| 25% | 8.645000 | 0.027047 | 94.412000 | 1987.000000 |
| 50% | 12.500000 | 0.054154 | 141.415400 | 1999.000000 |
| 75% | 16.700000 | 0.093463 | 186.026600 | 2004.000000 |
| max | 21.350000 | 0.323637 | 266.588400 | 2009.000000 |
# Print each test column's dtype and non-null count.
df_test.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 5681 entries, 0 to 5680 Data columns (total 11 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Item_Identifier 5681 non-null object 1 Item_Weight 4705 non-null float64 2 Item_Fat_Content 5681 non-null object 3 Item_Visibility 5681 non-null float64 4 Item_Type 5681 non-null object 5 Item_MRP 5681 non-null float64 6 Outlet_Identifier 5681 non-null object 7 Outlet_Establishment_Year 5681 non-null int64 8 Outlet_Size 4075 non-null object 9 Outlet_Location_Type 5681 non-null object 10 Outlet_Type 5681 non-null object dtypes: float64(3), int64(1), object(7) memory usage: 488.3+ KB
# Number of missing entries in each test column.
missing_val_test = df_test.isnull().sum()

# Share of missing values per training column, expressed as a fraction
# (missing count divided by the number of training rows).
missing_percent_train = missing_val_train.div(len(df_train))
missing_percent_train
Item_Identifier 0.000000 Item_Weight 0.171653 Item_Fat_Content 0.000000 Item_Visibility 0.000000 Item_Type 0.000000 Item_MRP 0.000000 Outlet_Identifier 0.000000 Outlet_Establishment_Year 0.000000 Outlet_Size 0.282764 Outlet_Location_Type 0.000000 Outlet_Type 0.000000 Item_Outlet_Sales 0.000000 dtype: float64
# Share of missing values per test column, expressed as a fraction.
# The denominator is the test set's own row count; dividing by
# len(df_train) understated every rate because the training set has more rows.
missing_percent_test = missing_val_test / len(df_test)
missing_percent_test
Item_Identifier 0.000000 Item_Weight 0.114514 Item_Fat_Content 0.000000 Item_Visibility 0.000000 Item_Type 0.000000 Item_MRP 0.000000 Outlet_Identifier 0.000000 Outlet_Establishment_Year 0.000000 Outlet_Size 0.188431 Outlet_Location_Type 0.000000 Outlet_Type 0.000000 dtype: float64
# Number of training rows per item identifier, most frequent first.
df_train.Item_Identifier.value_counts()
FDW13 10
FDG33 10
FDF52 9
FDX20 9
FDW49 9
..
FDC23 1
FDY43 1
FDE52 1
DRF48 1
FDO33 1
Name: Item_Identifier, Length: 1559, dtype: int64
# Number of training rows per item type, most frequent first.
df_train.Item_Type.value_counts()
Fruits and Vegetables 1232 Snack Foods 1200 Household 910 Frozen Foods 856 Dairy 682 Canned 649 Baking Goods 648 Health and Hygiene 520 Soft Drinks 445 Meat 425 Breads 251 Hard Drinks 214 Others 169 Starchy Foods 148 Breakfast 110 Seafood 64 Name: Item_Type, dtype: int64
# Outlet sales broken down by item type, one colour per type.
px.histogram(
    df_train,
    x='Item_Type',
    y='Item_Outlet_Sales',
    color='Item_Type',
)

# Outlet sales broken down by outlet type, one colour per type.
px.histogram(
    df_train,
    x='Outlet_Type',
    y='Item_Outlet_Sales',
    color='Outlet_Type',
    title='Outlet Sale vs Outlet Type',
)

# Frequency of each distinct item weight, followed by a look at the full frame.
df_train.Item_Weight.value_counts()
df_train
| Item_Identifier | Item_Weight | Item_Fat_Content | Item_Visibility | Item_Type | Item_MRP | Outlet_Identifier | Outlet_Establishment_Year | Outlet_Size | Outlet_Location_Type | Outlet_Type | Item_Outlet_Sales | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | FDA15 | 9.300 | Low Fat | 0.016047 | Dairy | 249.8092 | OUT049 | 1999 | Medium | Tier 1 | Supermarket Type1 | 3735.1380 |
| 1 | DRC01 | 5.920 | Regular | 0.019278 | Soft Drinks | 48.2692 | OUT018 | 2009 | Medium | Tier 3 | Supermarket Type2 | 443.4228 |
| 2 | FDN15 | 17.500 | Low Fat | 0.016760 | Meat | 141.6180 | OUT049 | 1999 | Medium | Tier 1 | Supermarket Type1 | 2097.2700 |
| 3 | FDX07 | 19.200 | Regular | 0.000000 | Fruits and Vegetables | 182.0950 | OUT010 | 1998 | NaN | Tier 3 | Grocery Store | 732.3800 |
| 4 | NCD19 | 8.930 | Low Fat | 0.000000 | Household | 53.8614 | OUT013 | 1987 | High | Tier 3 | Supermarket Type1 | 994.7052 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 8518 | FDF22 | 6.865 | Low Fat | 0.056783 | Snack Foods | 214.5218 | OUT013 | 1987 | High | Tier 3 | Supermarket Type1 | 2778.3834 |
| 8519 | FDS36 | 8.380 | Regular | 0.046982 | Baking Goods | 108.1570 | OUT045 | 2002 | NaN | Tier 2 | Supermarket Type1 | 549.2850 |
| 8520 | NCJ29 | 10.600 | Low Fat | 0.035186 | Health and Hygiene | 85.1224 | OUT035 | 2004 | Small | Tier 2 | Supermarket Type1 | 1193.1136 |
| 8521 | FDN46 | 7.210 | Regular | 0.145221 | Snack Foods | 103.1332 | OUT018 | 2009 | Medium | Tier 3 | Supermarket Type2 | 1845.5976 |
| 8522 | DRG01 | 14.800 | Low Fat | 0.044878 | Soft Drinks | 75.4670 | OUT046 | 1997 | Small | Tier 1 | Supermarket Type1 | 765.6700 |
8523 rows × 12 columns
# Missing-data imputation: replace absent outlet sizes with the most
# frequent size in the training set.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the selected column: that chained in-place call is
# deprecated in recent pandas and leaves the frame unchanged when
# Copy-on-Write is enabled.
most_common_size = df_train['Outlet_Size'].mode()[0]
df_train['Outlet_Size'] = df_train['Outlet_Size'].fillna(most_common_size)

# Confirm that no missing outlet sizes remain.
df_train.Outlet_Size.isna().sum()
0
# Item visibility against sales, drawn from a random sample of 2000 rows.
px.scatter(
    df_train.sample(n=2000),
    x='Item_Visibility',
    y='Item_Outlet_Sales',
    title='Item Visibilty vs Item Sale',
)

# Outlet sales broken down by outlet size, one colour per size.
px.histogram(
    df_train,
    x='Outlet_Size',
    y='Item_Outlet_Sales',
    color='Outlet_Size',
    title='Outlet Size vs Outlet Sales',
)

# Preview of the descriptive columns cast to 'category'. The result is only
# displayed; df_train itself is not modified by this expression.
df_train[[
    'Item_Fat_Content',
    'Item_Type',
    'Outlet_Identifier',
    'Outlet_Size',
    'Outlet_Location_Type',
    'Outlet_Type',
]].astype('category')
| Item_Fat_Content | Item_Type | Outlet_Identifier | Outlet_Size | Outlet_Location_Type | Outlet_Type | |
|---|---|---|---|---|---|---|
| 0 | Low Fat | Dairy | OUT049 | Medium | Tier 1 | Supermarket Type1 |
| 1 | Regular | Soft Drinks | OUT018 | Medium | Tier 3 | Supermarket Type2 |
| 2 | Low Fat | Meat | OUT049 | Medium | Tier 1 | Supermarket Type1 |
| 3 | Regular | Fruits and Vegetables | OUT010 | Medium | Tier 3 | Grocery Store |
| 4 | Low Fat | Household | OUT013 | High | Tier 3 | Supermarket Type1 |
| ... | ... | ... | ... | ... | ... | ... |
| 8518 | Low Fat | Snack Foods | OUT013 | High | Tier 3 | Supermarket Type1 |
| 8519 | Regular | Baking Goods | OUT045 | Medium | Tier 2 | Supermarket Type1 |
| 8520 | Low Fat | Health and Hygiene | OUT035 | Small | Tier 2 | Supermarket Type1 |
| 8521 | Regular | Snack Foods | OUT018 | Medium | Tier 3 | Supermarket Type2 |
| 8522 | Low Fat | Soft Drinks | OUT046 | Small | Tier 1 | Supermarket Type1 |
8523 rows × 6 columns
# Re-check dtypes and non-null counts of the training set at this point.
df_train.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 8523 entries, 0 to 8522 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Item_Identifier 8523 non-null object 1 Item_Weight 7060 non-null float64 2 Item_Fat_Content 8523 non-null object 3 Item_Visibility 8523 non-null float64 4 Item_Type 8523 non-null object 5 Item_MRP 8523 non-null float64 6 Outlet_Identifier 8523 non-null object 7 Outlet_Establishment_Year 8523 non-null int64 8 Outlet_Size 8523 non-null object 9 Outlet_Location_Type 8523 non-null object 10 Outlet_Type 8523 non-null object 11 Item_Outlet_Sales 8523 non-null float64 dtypes: float64(4), int64(1), object(7) memory usage: 799.2+ KB
# Convert every object-typed column of the training set to the 'category'
# dtype. The column list is derived from the frame itself and then actually
# used, replacing one hand-written conversion line per column.
object_col = [col for col in df_train.columns if df_train[col].dtype == 'object']
for col in object_col:
    df_train[col] = df_train[col].astype('category')

# Verify the resulting dtypes.
df_train.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 8523 entries, 0 to 8522 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Item_Identifier 8523 non-null category 1 Item_Weight 7060 non-null float64 2 Item_Fat_Content 8523 non-null category 3 Item_Visibility 8523 non-null float64 4 Item_Type 8523 non-null category 5 Item_MRP 8523 non-null float64 6 Outlet_Identifier 8523 non-null category 7 Outlet_Establishment_Year 8523 non-null int64 8 Outlet_Size 8523 non-null category 9 Outlet_Location_Type 8523 non-null category 10 Outlet_Type 8523 non-null category 11 Item_Outlet_Sales 8523 non-null float64 dtypes: category(7), float64(4), int64(1) memory usage: 453.5 KB
# Sales plotted against price, outlet establishment year and fat content.
px.scatter(df_train, x='Item_MRP', y='Item_Outlet_Sales',
           title='Item MRP vs Outlet Sales', color='Item_MRP')
px.scatter(df_train, x='Outlet_Establishment_Year', y='Item_Outlet_Sales',
           title='Outlet_Establishment_Year vs Outlet Sales',
           color='Outlet_Establishment_Year')
px.histogram(df_train, x='Item_Fat_Content', y='Item_Outlet_Sales',
             title='Item_Fat_Content vs Outlet Sales', color='Item_Fat_Content')

# Distribution of each column of interest: a histogram with a marginal box
# plot. Each entry is (column name, number of bins); one loop replaces six
# copies of the same plotting stanza.
distribution_specs = [
    ('Item_Weight', 17),
    ('Item_Visibility', 32),
    ('Item_MRP', 32),
    ('Item_Outlet_Sales', 32),
    ('Outlet_Establishment_Year', 32),
    ('Item_Fat_Content', 6),
]
for column, nbins in distribution_specs:
    fig = px.histogram(
        df_train,
        x=column,
        marginal='box',
        nbins=nbins,
        title=f'Distribution of {column}',
    )
    fig.update_layout(bargap=0.1)
    fig.show()

# Pairwise correlation of the numeric columns. The numeric columns are
# selected explicitly because df_train also holds category columns, and
# recent pandas versions no longer drop non-numeric columns in corr().
df_train.select_dtypes(include='number').corr()
| Item_Weight | Item_Visibility | Item_MRP | Outlet_Establishment_Year | Item_Outlet_Sales | |
|---|---|---|---|---|---|
| Item_Weight | 1.000000 | -0.014048 | 0.027141 | -0.011588 | 0.014123 |
| Item_Visibility | -0.014048 | 1.000000 | -0.001315 | -0.074834 | -0.128625 |
| Item_MRP | 0.027141 | -0.001315 | 1.000000 | 0.005020 | 0.567574 |
| Outlet_Establishment_Year | -0.011588 | -0.074834 | 0.005020 | 1.000000 | -0.049135 |
| Item_Outlet_Sales | 0.014123 | -0.128625 | 0.567574 | -0.049135 | 1.000000 |
Item_MRP and Item_Outlet_Sales are moderately positively correlated (about 0.57), which matches the pattern seen in the scatter plot above.
# Annotated heatmap of the correlation matrix of the numeric columns.
# Numeric columns are selected explicitly because df_train also holds
# category columns, which corr() does not skip on recent pandas versions.
numeric_corr = df_train.select_dtypes(include='number').corr()
sns.heatmap(numeric_corr, cmap='Reds', annot=True)
plt.title('Corelation Matrix');
#Standardising the values